Predict survival on the Titanic dataset
The sinking of the RMS Titanic is one of the most infamous shipwrecks in history. On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. This sensational tragedy shocked the international community and led to better safety regulations for ships.
One of the reasons that the shipwreck led to such loss of life was that there were not enough lifeboats for the passengers and crew. Although there was some element of luck involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class.
In this challenge, we ask you to complete the analysis of what sorts of people were likely to survive. In particular, we ask you to apply the tools of machine learning to predict which passengers survived the tragedy. https://www.kaggle.com/c/titanic
In [ ]:
In [13]:
import warnings
warnings.filterwarnings('ignore')
# SKLearn Model Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression , Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
# SKLearn ensemble classifiers
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier , BaggingClassifier
from sklearn.ensemble import VotingClassifier , AdaBoostClassifier
# SKLearn Modelling Helpers
# The original module paths are tried first, so environments where they still
# exist behave exactly as before; newer scikit-learn falls back to the current
# locations:
#   * Imputer left sklearn.preprocessing in 0.22. Its successor is
#     sklearn.impute.SimpleImputer (note: SimpleImputer has no `axis` argument).
#   * sklearn.cross_validation was removed in 0.20 in favour of
#     sklearn.model_selection.
try:
    from sklearn.preprocessing import Imputer , Normalizer , scale
except ImportError:
    from sklearn.preprocessing import Normalizer , scale
    from sklearn.impute import SimpleImputer as Imputer
try:
    from sklearn.cross_validation import train_test_split , StratifiedKFold
except ImportError:
    from sklearn.model_selection import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV
# Handle table-like data and matrices
import numpy as np
import pandas as pd
# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
# plot functions
import pltFunctions as pfunc
# Configure visualisations
%matplotlib inline
# Global plot appearance: ggplot as the matplotlib base style, seaborn's
# 'white' style on top of it, and a default figure size of 8 x 6.
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = (8, 6)
In [14]:
# Load the two input CSV files (paths are relative to the working directory).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./input/train.csv", "./input/test.csv")
)
In [15]:
#combined = pd.concat([train.drop('Survived',1),test])
#combined = train.append( test, ignore_index = True)
# Stack train and test into a single frame with a fresh 0..n-1 index, so the
# cleaning cells below operate on both at once. pd.concat is used instead of
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
full = pd.concat( [ train , test ] , ignore_index = True )
# The separate frames are not used again after the merge.
del train, test
#train = full[ :891 ]
#combined = combined.drop( 'Survived',1)
In [16]:
#print ('Datasets:' , 'combined:' , combined.shape , 'full:' , full.shape , 'train:' , train.shape)
In [17]:
# Preview the first ten rows of the combined frame.
full.head(10)
Out[17]:
In [18]:
# Count the missing values in each column of the combined frame.
print(full.isnull().sum(axis=0))
In [19]:
# Passenger counts for every Pclass (rows) / Sex (columns) combination.
pd.crosstab(index=full['Pclass'], columns=full['Sex'])
Out[19]:
In [20]:
# Mean Age for every (Sex, Pclass) group, computed once and then printed.
# Despite its name, agedf is a Series indexed by (Sex, Pclass); the final
# expression shows its type as the cell output.
agedf = full.groupby(['Sex', 'Pclass'])['Age'].mean()
print(agedf)
type(agedf)
Out[20]:
In [21]:
#for age in full:
# if full['Age'].isnull():
# print (agedf.where(agedf['Sex'] == full['Sex'])&(agedf['Pclass']==full['Pclass']))
In [22]:
def fillMissingAge(dframe):
    """Replace missing 'Age' values with the mean of the known ages.

    The 'Age' column of ``dframe`` is overwritten in place and the same
    frame object is returned, so the call can be used as
    ``df = fillMissingAge(df)``.
    """
    ages = dframe['Age']
    dframe['Age'] = ages.fillna(ages.mean())
    return dframe
def fillMissingFare(dframe):
    """Replace missing 'Fare' values with the mean of the known fares.

    The 'Fare' column of ``dframe`` is overwritten in place and the same
    frame object is returned.
    """
    mean_fare = dframe['Fare'].mean()
    dframe['Fare'] = dframe['Fare'].fillna(mean_fare)
    return dframe
In [23]:
# Impute Age first, then Fare, and re-check which columns still have gaps.
full = fillMissingFare(fillMissingAge(full))
print(full.isnull().sum(axis=0))
In [ ]:
In [24]:
# Show the rows whose Embarked value is missing.
print(full.loc[full['Embarked'].isnull()])
In [25]:
# Tabulate Embarked against Sex, keeping only the Sex values equal to 1
# (Series.where turns every other value into NaN).
# NOTE(review): pfunc.convertSexToNum is only applied in a later cell; if Sex
# does not hold the value 1 at this point the mask matches nothing — confirm
# the intended cell order.
pd.crosstab(full['Embarked'], full['Sex'].where(full['Sex'] == 1))
Out[25]:
In [26]:
# Group sizes by (Embarked, Pclass, Parch, SibSp) restricted to Sex == 1 and
# Pclass == 1: DataFrame.where blanks every non-matching row to NaN before
# the groupby.
# NOTE(review): depends on Sex already holding the value 1 at this point —
# confirm against where pfunc.convertSexToNum is run.
full.where((full['Sex']==1) & (full['Pclass']==1)).groupby(['Embarked','Pclass','Parch','SibSp']).size()
Out[26]:
In [27]:
# Share of each embarkation port (C, Q, S) within two hand-entered tallies,
# followed by the per-port sum of the two shares.
# NOTE(review): the counts look copied from the two tables above — confirm.
nt = sum((115, 60, 291))
pC, pQ, pS = (count / nt for count in (115, 60, 291))
print('Prob C :', pC, 'Prob Q :', pQ, 'Prob S :', pS)
nC = sum((30, 2, 20))
p0C, p0Q, p0S = (count / nC for count in (30, 2, 20))
print('Prob C :', p0C, 'Prob Q :', p0Q, 'Prob S :', p0S)
print('Sum of probabilities')
print('Prob C :', pC + p0C, 'Prob Q :', pQ + p0Q, 'Prob S :', pS + p0S)
In [28]:
# Trying S for both passengers
# Assign through one .loc call on the frame itself. The chained form
# full['Embarked'].iloc[i] = ... writes into an intermediate Series that
# pandas may treat as a copy (SettingWithCopyWarning; the write is never
# propagated under Copy-on-Write). full.index[[61, 829]] keeps the rows
# addressed by position, exactly as .iloc did.
full.loc[full.index[[61, 829]], 'Embarked'] = "S"
In [29]:
# Re-check the per-column missing-value counts after filling Embarked.
print(full.isnull().sum(axis=0))
In [30]:
def fillCabin(dframe):
    """Reduce 'Cabin' to its first character and return its one-hot encoding.

    Missing cabins are labelled 'U' first. Side effect: ``dframe['Cabin']``
    is overwritten in place with the single-character value. The return
    value is a new frame of indicator columns named 'Cabin_<char>', not the
    input frame.
    """
    first_chars = dframe['Cabin'].fillna('U').map(lambda cabin: cabin[0])
    dframe['Cabin'] = first_chars
    # dummy encoding ...
    return pd.get_dummies(first_chars, prefix='Cabin')
In [31]:
# Build the Cabin indicator columns once and print them. fillCabin also
# rewrites full['Cabin'] in place, and a repeat call on the already-reduced
# column yields the same frame, so one call is enough for both uses.
newDF = fillCabin(full)
print(newDF)
# Attach the indicator columns alongside the existing ones.
full = pd.concat([full, newDF], axis=1)
#full = full.drop('Cabin',1)
In [32]:
# Display the combined frame, which now carries the Cabin_* indicator columns.
full
Out[32]:
In [33]:
#print( full.where((full['Sex'] == 0) & (full['Pclass'] == 1)).groupby(['Pclass','Sex'])['Age'].mean() )
# Number of rows with a missing Sex value.
print( full['Sex'].isnull().sum() )
In [ ]:
In [34]:
#byTicket = full.where(full['Cabin'].isnull()).groupby(['Name'])['Ticket']
#byFare = full.where(full['Cabin'].isnull()).groupby(['Pclass'])['Fare']
#byTicket.head(5)
#byFare.head(5)
In [35]:
# Replace full with the result of the project helper convertSexToNum
# (presumably encodes Sex as numbers — confirm in pltFunctions), then preview
# the first rows.
full = pfunc.convertSexToNum(full)
full.head()
Out[35]:
In [36]:
# Name each Deck according to the Cabin description;
# a Deck of U stands for an unknown Cabin description.
# NOTE(review): the Deck column is presumably added by pfunc.fillDeck —
# confirm in pltFunctions.
full = pfunc.fillDeck(full)
# Tabulate Deck against Survived.
pd.crosstab(full['Deck'], full['Survived'])
Out[36]:
In [37]:
# Per-column missing-value counts, a separator line, then the frame summary.
print(full.isnull().sum())
print("========================================")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only added a stray "None" line after the report.
full.info()
In [38]:
# Run the feature-engineering helper once and print that same result.
# Calling pfunc.featureEng twice (once for the print, once for the
# assignment) repeated the work, and would apply the transformation twice
# over if the helper modifies its argument in place.
full = pfunc.featureEng( full )
print(full)
In [39]:
#pfunc.pltCorrel( combined )
#pfunc.pltCorrel( full )
#pfunc.pltCorrel( full )
Pclass is correlated with Fare (1st-class tickets would be more expensive than those of the other classes)
Pclass x Age
SibSp X Age
SibSp x Fare
SibSp is correlated with Parch (large families would have high values of parents aboard, and solo travellers would have zero parents aboard)
Pclass is noticeably correlated with Survived (as expected, passengers in the higher classes were more likely to survive)
In [40]:
# Plot distributions of Age of passengers who survived or did not survive
#pfunc.pltDistro( train , var = 'Age' , target = 'Survived' , row = 'Sex' )
In [41]:
# Plot distributions of Survived by Pclass (var = 'Survived', target = 'Pclass'), one row per Sex
#pfunc.pltDistro( train , var = 'Survived' , target = 'Pclass' , row = 'Sex' )
In [42]:
# Plot distributions of Parch of passengers who survived or did not survive
#pfunc.pltDistro( train , var = 'Parch' , target = 'Survived' , row = 'Sex' )
In [43]:
# Preview the first five rows of the frame.
full.head(5)
Out[43]:
In [49]:
# Plot distributions of Age of passengers who survived or did not survive
#pfunc.pltCategories( train , cat = 'Embarked' , target = 'Survived' )
#pfunc.pltCategories( train , cat = 'Pclass' , target = 'Survived' )
#pfunc.pltCategories( train , cat = 'Sex' , target = 'Survived' )
#pfunc.pltCategories( train , cat = 'Parch' , target = 'Survived' )
#pfunc.pltCategories( train , cat = 'SibSp' , target = 'Survived' )
#pfunc.pltDistro( train , var = 'Age' , target = 'Survived' , row = 'Sex' )
# Remove the Survived column from the combined frame. axis is passed by
# keyword: pandas 2.0 made every drop() argument after `labels` keyword-only,
# so the positional form drop('Survived', 1) raises TypeError there.
full = full.drop('Survived', axis=1)
In [ ]:
def getTitles(dframe):
    """Add a 'Title' column holding a coarse title group parsed from 'Name'.

    The raw title is taken from the part of each name that follows the first
    comma, cut at the first period and stripped of surrounding spaces. It is
    then collapsed into one of the groups in the table below; a raw title
    that is not in the table becomes NaN. ``dframe`` is modified in place and
    also returned.
    """
    title_groups = {
        # Military ranks, doctors and clergy
        "Capt": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Dr": "Officer",
        "Rev": "Officer",
        # Nobility and honorifics
        "Lady": "Royalty",
        "Jonkheer": "Royalty",
        "Don": "Royalty",
        "Sir": "Royalty",
        "the Countess": "Royalty",
        "Dona": "Royalty",
        # Variants folded into the common titles
        "Mme": "Mrs",
        "Mlle": "Miss",
        "Ms": "Mrs",
        # Common titles kept as they are
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Miss": "Miss",
        "Master": "Master",
    }

    def raw_title(name):
        after_comma = name.split(',')[1]
        return after_comma.split('.')[0].strip()

    dframe['Title'] = dframe['Name'].map(raw_title).map(title_groups)
    return dframe
In [57]:
# Add the grouped Title column to the combined frame and preview the result.
full = getTitles(full)
full.head()
Out[57]:
In [56]:
# plot functions
import pltFunctions as pfunc
# Split the combined frame with the project helper; its three return values
# are unpacked as training features, test features and the target.
# NOTE(review): Survived was dropped from full in an earlier cell — confirm
# where prepareTrainTestTarget takes the target from.
train_X, test_X, target_y = pfunc.prepareTrainTestTarget(full)
#train_valid_X = full[ 0:891 ]
#train_valid_y = full.Survived
#test_X = full[ 891: ]
#train_X , valid_X , train_y , valid_y = train_test_split( train_X , train_valid_y , train_size = .7 )
# Print the shapes of the frame and of the three pieces as a sanity check.
print (full.shape , train_X.shape , target_y.shape , test_X.shape)
In [51]:
# Fit a random forest with 100 trees on the training features and target.
model = RandomForestClassifier(n_estimators=100)
#model = SVC()
model.fit( train_X , target_y )
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: